{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "59d5ff3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('processed-cantonese-radio.json') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f74d405a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "\n",
    "questions = ['can you describe the audio', 'explain about the audio']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "278c18ee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'audio_filename': 'cantonese-radio-mp3/20201231_彌敦道政交所_224.mp3',\n",
       " 'speech_duration': 27.759375000000002,\n",
       " 'transcript_whisper': '冇得改變嘅噉，我只不過系將呢個現象擺出嚟啫，中共佢寧可放開一啲嘅經濟權，佢都唔會放軍權嘅噉。總結個百年裏面呢呢個系佢嘅一個特點，喇好喇到第十樣喇同香港有關喇系噉。因爲過去中共都利用過香港㗎，其實一百年都利用好多吖，都有九十九年都系噉樣冇變嘅。因爲唔好講中共。'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1209e2fc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████| 1096211/1096211 [00:15<00:00, 69808.52it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "selected = []\n",
    "for i in tqdm(range(len(data))):\n",
    "    \n",
    "    d = None\n",
    "    filename = os.path.join('vllm-cantonese-description', f'{i}.json')\n",
    "    try:\n",
    "        with open(filename) as fopen:\n",
    "            d = json.load(fopen)\n",
    "    except Exception as e:\n",
    "        pass\n",
    "    \n",
    "    filename = os.path.join('vllm-cantonese-description-v2', f'{i}.json')\n",
    "    try:\n",
    "        with open(filename) as fopen:\n",
    "            d = json.load(fopen)\n",
    "    except Exception as e:\n",
    "        pass\n",
    "    \n",
    "    if d is None:\n",
    "        continue\n",
    "        \n",
    "    d = d.replace('<END>', '').replace('<END', '').strip()\n",
    "    \n",
    "    audio_filename = data[i]['audio_filename'].replace('-mp3/', '-mp3-16k/')\n",
    "    if not os.path.exists(audio_filename):\n",
    "        continue\n",
    "    \n",
    "    selected.append({\n",
    "        'question': random.choice(questions),\n",
    "        'answer': d,\n",
    "        'metadata': json.dumps(data[i]),\n",
    "        'audio_filename': audio_filename,\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d2ebc398",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "338813"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "81b6c096",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'explain about the audio',\n",
       " 'answer': \"In this Cantonese audio clip, the speaker discusses a significant observation about the Chinese Communist Party (CCP) in a matter-of-fact and slightly critical tone. Here’s a breakdown of the key points:\\n\\n1. **Unchangeable Reality**: The speaker starts by emphasizing that what they are discussing is an unchangeable fact (冇得改變嘅噉). This sets the stage for a serious and definitive statement.\\n\\n2. **Economic vs. Military Power**: The speaker notes that the CCP is willing to loosen its grip on economic power (中共佢寧可放開一啲嘅經濟權) but will never relinquish military control (佢都唔會放軍權噉). This highlights the CCP's strategic priorities and the non-negotiable nature of military authority.\\n\\n3. **Historical Context**: The speaker then provides historical context, summarizing that over the past century, this has been a consistent characteristic of the CCP (總結個百年裏面呢呢個系佢嘅一個特點). This reinforces the idea that the CCP's behavior is deeply rooted in its history and ideology.\\n\\n4. **Hong Kong's Role**: The conversation shifts to Hong Kong, noting that it is the tenth point in the discussion (喇好喇到第十樣喇同香港有關喇系噉). The speaker explains that the CCP has historically used Hong Kong to its advantage (因爲過去中共都利用過香港㗎), and this practice has been ongoing for nearly a century (其實一百年都利用好多吖，都有九十九年都系噉樣冇變嘅).\\n\\n5. **Subtle Criticism**: The speaker concludes with a somewhat sarcastic note, suggesting that the CCP's use of Hong Kong is a well-known fact (因爲唔好講中共), implying that it's unnecessary to elaborate further because it's so obvious.\\n\\nOverall, the tone is straightforward and slightly critical, blending factual observations with a touch of sarcasm. The speaker aims to highlight the CCP's consistent behavior over time, particularly in relation to economic and military power, and its long-standing use of Hong Kong. The language is clear and direct, making the points easy to understand while also conveying a sense of frustration or disappointment with the CCP's actions.\",\n",
       " 'metadata': '{\"audio_filename\": \"cantonese-radio-mp3/20201231_\\\\u5f4c\\\\u6566\\\\u9053\\\\u653f\\\\u4ea4\\\\u6240_224.mp3\", \"speech_duration\": 27.759375000000002, \"transcript_whisper\": \"\\\\u5187\\\\u5f97\\\\u6539\\\\u8b8a\\\\u5605\\\\u5649\\\\uff0c\\\\u6211\\\\u53ea\\\\u4e0d\\\\u904e\\\\u7cfb\\\\u5c07\\\\u5462\\\\u500b\\\\u73fe\\\\u8c61\\\\u64fa\\\\u51fa\\\\u569f\\\\u556b\\\\uff0c\\\\u4e2d\\\\u5171\\\\u4f62\\\\u5be7\\\\u53ef\\\\u653e\\\\u958b\\\\u4e00\\\\u5572\\\\u5605\\\\u7d93\\\\u6fdf\\\\u6b0a\\\\uff0c\\\\u4f62\\\\u90fd\\\\u5514\\\\u6703\\\\u653e\\\\u8ecd\\\\u6b0a\\\\u5605\\\\u5649\\\\u3002\\\\u7e3d\\\\u7d50\\\\u500b\\\\u767e\\\\u5e74\\\\u88cf\\\\u9762\\\\u5462\\\\u5462\\\\u500b\\\\u7cfb\\\\u4f62\\\\u5605\\\\u4e00\\\\u500b\\\\u7279\\\\u9ede\\\\uff0c\\\\u5587\\\\u597d\\\\u5587\\\\u5230\\\\u7b2c\\\\u5341\\\\u6a23\\\\u5587\\\\u540c\\\\u9999\\\\u6e2f\\\\u6709\\\\u95dc\\\\u5587\\\\u7cfb\\\\u5649\\\\u3002\\\\u56e0\\\\u7232\\\\u904e\\\\u53bb\\\\u4e2d\\\\u5171\\\\u90fd\\\\u5229\\\\u7528\\\\u904e\\\\u9999\\\\u6e2f\\\\u35ce\\\\uff0c\\\\u5176\\\\u5be6\\\\u4e00\\\\u767e\\\\u5e74\\\\u90fd\\\\u5229\\\\u7528\\\\u597d\\\\u591a\\\\u5416\\\\uff0c\\\\u90fd\\\\u6709\\\\u4e5d\\\\u5341\\\\u4e5d\\\\u5e74\\\\u90fd\\\\u7cfb\\\\u5649\\\\u6a23\\\\u5187\\\\u8b8a\\\\u5605\\\\u3002\\\\u56e0\\\\u7232\\\\u5514\\\\u597d\\\\u8b1b\\\\u4e2d\\\\u5171\\\\u3002\"}',\n",
       " 'audio_filename': 'cantonese-radio-mp3-16k/20201231_彌敦道政交所_224.mp3'}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b64c8695",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "dataset = Dataset.from_list(selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "16b16711",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "25404a4438574b4e9b56f913d1bb7f0c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "be77e52dd81d4e0d81b812b36b1e4709",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/170 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c3319d905c7244b29a9f774bc6066c9a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/234M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8d8f13e57cdf438aabf91b7708cdb8a7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/170 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b78f508f855a4720bcee6dfcd0a4584f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/232M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5f450eff9b874270938253a5439b79b3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/882 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Cantonese-Radio-Description-Instructions/commit/92f6229c6c19a367841f566784336ad987aad4eb', commit_message='Upload dataset', commit_description='', oid='92f6229c6c19a367841f566784336ad987aad4eb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Cantonese-Radio-Description-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Cantonese-Radio-Description-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.push_to_hub('mesolitica/Cantonese-Radio-Description-Instructions')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
